This group project is about predicting outbreaks of West Nile virus in Chicago, IL. The virus is transmitted to humans by mosquitoes, so traps were set across the city to collect them, and the captured mosquitoes were tested for the virus. The information gathered from each trap, such as the mosquito species, together with the location, date, weather, and whether pesticide had been sprayed nearby, is used to predict whether a trap contains a West Nile virus-carrying mosquito.

The training set contains information for the years 2007, 2009, 2011, and 2013 and is used to predict the presence of WNV in 2008, 2010, 2012, and 2014; a quick check of the year coverage follows the data-loading cell below.


In [54]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [55]:
# importing data; spray data for the test years (2008, 2010, 2012, 2014) is not provided, so spray information is not used in the analysis
train = pd.read_csv('./assets/train.csv')
test =pd.read_csv('./assets/test.csv')
weather = pd.read_csv('./assets/weather.csv')
spray =pd.read_csv('./assets/spray.csv')
mapdata = np.loadtxt("./assets/mapdata_copyright_openstreetmap_contributors.txt")
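
As a quick sanity check on the year split described above, and on the missing spray coverage, the years present in each file can be listed directly. This is a supplementary check, not part of the original notebook.

In [ ]:
# supplementary check (not in the original notebook): which years does each file cover?
for name, df in [('train', train), ('test', test), ('spray', spray)]:
    years = sorted(pd.to_datetime(df['Date']).dt.year.unique())
    print(name + ': ' + str(years))
# train should cover 2007/2009/2011/2013, test should cover 2008/2010/2012/2014,
# and spray should not include any of the test years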

In [56]:
spray.head()


Out[56]:
Date Time Latitude Longitude
0 2011-08-29 6:56:58 PM 42.391623 -88.089163
1 2011-08-29 6:57:08 PM 42.391348 -88.089163
2 2011-08-29 6:57:18 PM 42.391022 -88.089157
3 2011-08-29 6:57:28 PM 42.390637 -88.089158
4 2011-08-29 6:57:38 PM 42.390410 -88.088858

In [57]:
train[['Date','Trap','Species','Latitude','Longitude','NumMosquitos','WnvPresent']].head()


Out[57]:
Date Trap Species Latitude Longitude NumMosquitos WnvPresent
0 2007-05-29 T002 CULEX PIPIENS/RESTUANS 41.954690 -87.800991 1 0
1 2007-05-29 T002 CULEX RESTUANS 41.954690 -87.800991 1 0
2 2007-05-29 T007 CULEX RESTUANS 41.994991 -87.769279 1 0
3 2007-05-29 T015 CULEX PIPIENS/RESTUANS 41.974089 -87.824812 1 0
4 2007-05-29 T015 CULEX RESTUANS 41.974089 -87.824812 4 0

In [58]:
weather.head()


Out[58]:
Station Date Tmax Tmin Tavg Depart DewPoint WetBulb Heat Cool ... CodeSum Depth Water1 SnowFall PrecipTotal StnPressure SeaLevel ResultSpeed ResultDir AvgSpeed
0 1 2007-05-01 83 50 67 14 51 56 0 2 ... 0 M 0.0 0.00 29.10 29.82 1.7 27 9.2
1 2 2007-05-01 84 52 68 M 51 57 0 3 ... M M M 0.00 29.18 29.82 2.7 25 9.6
2 1 2007-05-02 59 42 51 -3 42 47 14 0 ... BR 0 M 0.0 0.00 29.38 30.09 13.0 4 13.4
3 2 2007-05-02 60 43 52 M 42 47 13 0 ... BR HZ M M M 0.00 29.44 30.08 13.3 2 13.4
4 1 2007-05-03 66 46 56 2 40 48 9 0 ... 0 M 0.0 0.00 29.39 30.12 11.7 7 11.9

5 rows × 22 columns


In [59]:
train.Trap.unique()


Out[59]:
array(['T002', 'T007', 'T015', 'T045', 'T046', 'T048', 'T049', 'T050',
       'T054', 'T086', 'T091', 'T094', 'T096', 'T129', 'T143', 'T148',
       'T153', 'T159', 'T009', 'T011', 'T016', 'T019', 'T025', 'T028',
       'T031', 'T033', 'T089', 'T090', 'T092', 'T135', 'T141', 'T142',
       'T145', 'T146', 'T147', 'T149', 'T150', 'T151', 'T152', 'T154',
       'T158', 'T162', 'T218', 'T220', 'T001', 'T003', 'T006', 'T008',
       'T012', 'T034', 'T037', 'T040', 'T043', 'T047', 'T051', 'T085',
       'T088', 'T161', 'T219', 'T013', 'T014', 'T018', 'T030', 'T084',
       'T144', 'T160', 'T005', 'T017', 'T044', 'T095', 'T004', 'T035',
       'T036', 'T039', 'T060', 'T061', 'T062', 'T065', 'T066', 'T067',
       'T069', 'T070', 'T071', 'T073', 'T074', 'T075', 'T076', 'T077',
       'T079', 'T080', 'T081', 'T082', 'T083', 'T114', 'T155', 'T063',
       'T115', 'T138', 'T200', 'T206', 'T209', 'T212', 'T215', 'T107',
       'T128', 'T072', 'T078', 'T097', 'T099', 'T100', 'T102', 'T103',
       'T027', 'T156', 'T157', 'T221', 'T900', 'T903', 'T222', 'T223',
       'T225', 'T227', 'T224', 'T226', 'T229', 'T230', 'T228', 'T232',
       'T231', 'T235', 'T233', 'T236', 'T237', 'T238', 'T094B', 'T054C'], dtype=object)

In [61]:
# Number of mosquitoes in traps with or without WNV
ax = sns.boxplot( y="NumMosquitos",x="WnvPresent", data=train,showfliers=False)
plt.show()



In [10]:
# Given the number of mosquitoes in a trap, what is the probability of finding the virus in that trap?
number_of_mosquito_probability = train.groupby(pd.cut(train['NumMosquitos'], range(0, 50,5)), as_index = False).WnvPresent.mean()
number_of_mosquito_probability['NumMosquitos']=[5,10,15,20,25,30,35,40,45]
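
The same binned probabilities can also be computed by labeling each bin with its upper edge up front, instead of overwriting NumMosquitos afterwards; this equivalent sketch is supplementary, not the original approach.

In [ ]:
# equivalent sketch: label each 5-mosquito bin by its upper edge when cutting
bins = range(0, 50, 5)
binned = pd.cut(train['NumMosquitos'], bins, labels=bins[1:])
print(train.groupby(binned).WnvPresent.mean())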

In [103]:
ax = sns.barplot( x="NumMosquitos",y="WnvPresent", data=number_of_mosquito_probability)
ax.set(xlabel="Probability of West Nile Virus", ylabel="Number of Mosquitoes")
plt.show()



In [63]:
# effect of mosquito species on the presence of WNV
mosquito_type = train.groupby(train['Species'], as_index = False).WnvPresent.sum()
ax = sns.barplot(x="WnvPresent", y="Species", data=mosquito_type)
ax.set(xlabel='Number of WNV-positive records')
plt.show()



In [64]:
spec1 = train[train['Species'] =='CULEX RESTUANS']
number_of_mosquito_probability = spec1.groupby(pd.cut(spec1['NumMosquitos'], range(0, 50,5)), as_index = False).WnvPresent.mean()
number_of_mosquito_probability['NumMosquitos']=[5,10,15,20,25,30,35,40,45]
sns.barplot( x="NumMosquitos",y="WnvPresent", data=number_of_mosquito_probability)
plt.show()



In [65]:
train.Species.unique()


Out[65]:
array(['CULEX PIPIENS/RESTUANS', 'CULEX RESTUANS', 'CULEX PIPIENS',
       'CULEX SALINARIUS', 'CULEX TERRITANS', 'CULEX TARSALIS',
       'CULEX ERRATICUS'], dtype=object)

In [66]:
spec1 = train[train['Species'] =='CULEX PIPIENS/RESTUANS']
number_of_mosquito_probability = spec1.groupby(pd.cut(spec1['NumMosquitos'], range(0, 50,5)), as_index = False).WnvPresent.mean()
number_of_mosquito_probability['NumMosquitos']=[5,10,15,20,25,30,35,40,45]
sns.barplot( x="NumMosquitos",y="WnvPresent", data=number_of_mosquito_probability)
plt.show()



In [67]:
spec1 = train[train['Species'] =='CULEX PIPIENS']
number_of_mosquito_probability = spec1.groupby(pd.cut(spec1['NumMosquitos'], range(0, 50,5)), as_index = False).WnvPresent.mean()
number_of_mosquito_probability['NumMosquitos']=[5,10,15,20,25,30,35,40,45]
sns.barplot( x="NumMosquitos",y="WnvPresent", data=number_of_mosquito_probability)
plt.show()



In [68]:
# Adapted from a Kaggle kernel showing where the measurement points are (https://www.kaggle.com/dchudz/predict-west-nile-virus/where-are-the-measurement-points)
#%matplotlib inline

from sklearn.neighbors import KernelDensity

mapdata = np.loadtxt("./assets/mapdata_copyright_openstreetmap_contributors.txt")
traps = train[['Date', 'Trap','Longitude', 'Latitude', 'WnvPresent']]

alpha_cm = plt.cm.Reds
alpha_cm._init()
alpha_cm._lut[:-3,-1] = abs(np.logspace(0, 1, alpha_cm.N) / 10 - 1)[::-1]
aspect = mapdata.shape[0] * 1.0 / mapdata.shape[1]
lon_lat_box = (-88, -87.5, 41.6, 42.1)

sigthings = traps[traps['WnvPresent'] > 0]
sigthings = sigthings.groupby(['Date', 'Trap','Longitude', 'Latitude']).max()['WnvPresent'].reset_index()
X = sigthings[['Longitude', 'Latitude']].values
kd = KernelDensity(bandwidth=0.02)
kd.fit(X)

xv,yv = np.meshgrid(np.linspace(-88, -87.5, 100), np.linspace(41.6, 42.1, 100))
gridpoints = np.array([xv.ravel(),yv.ravel()]).T
zv = np.exp(kd.score_samples(gridpoints).reshape(100,100))
plt.figure(figsize=(10,14))
plt.imshow(mapdata, 
           cmap=plt.get_cmap('gray'), 
           extent=lon_lat_box, 
           aspect=aspect)
plt.imshow(zv, 
           origin='lower', 
           cmap=alpha_cm, 
           extent=lon_lat_box, 
           aspect=aspect)

locations = traps[['Longitude', 'Latitude']].drop_duplicates().values
plt.scatter(locations[:,0], locations[:,1], marker='x')
plt.show()
#plt.savefig('heatmap.png')



In [69]:
# Adapted from a Kaggle kernel showing where the measurement points are (https://www.kaggle.com/dchudz/predict-west-nile-virus/where-are-the-measurement-points)
#%matplotlib inline

from sklearn.neighbors import KernelDensity

mapdata = np.loadtxt("./assets/mapdata_copyright_openstreetmap_contributors.txt")
traps = spray[['Date','Longitude', 'Latitude']]
traps1 = train[['Longitude', 'Latitude']]
alpha_cm = plt.cm.Reds
alpha_cm._init()
alpha_cm._lut[:-3,-1] = abs(np.logspace(0, 1, alpha_cm.N) / 10 - 1)[::-1]
aspect = mapdata.shape[0] * 1.0 / mapdata.shape[1]
lon_lat_box = (-88, -87.5, 41.6, 42.1)

#sigthings = traps
#sigthings = sigthings.groupby([ 'Longitude', 'Latitude']).max()['Date'].reset_index()
#X = sigthings[['Longitude', 'Latitude']].values
#kd = KernelDensity(bandwidth=0.02)
#kd.fit(X)

xv,yv = np.meshgrid(np.linspace(-88, -87.5, 100), np.linspace(41.6, 42.1, 100))
gridpoints = np.array([xv.ravel(),yv.ravel()]).T
#zv = np.exp(kd.score_samples(gridpoints).reshape(100,100))
plt.figure(figsize=(10,14))
plt.imshow(mapdata, 
           cmap=plt.get_cmap('gray'), 
           extent=lon_lat_box, 
           aspect=aspect)
#plt.imshow(zv, 
 #          origin='lower', 
    #         cmap=alpha_cm, 
    #       extent=lon_lat_box, 
     #      aspect=aspect)

locations = traps[['Longitude', 'Latitude']].drop_duplicates().values
loc =traps1[['Longitude', 'Latitude']].drop_duplicates().values
plt.scatter(locations[:,0], locations[:,1], marker='x')
plt.scatter(loc[:,0], loc[:,1], marker='o', color = 'red')
plt.show()
#plt.savefig('heatmap.png')



In [70]:
from sklearn.neighbors import KernelDensity

mapdata = np.loadtxt("./assets/mapdata_copyright_openstreetmap_contributors.txt")
# note: `workingdata` and `data` are defined in the later cells In [72] and In [78]; run those cells first
traps = workingdata[['Longitude', 'Latitude']]
traps1 = data[['Longitude', 'Latitude']]
alpha_cm = plt.cm.Reds
alpha_cm._init()
alpha_cm._lut[:-3,-1] = abs(np.logspace(0, 1, alpha_cm.N) / 10 - 1)[::-1]
aspect = mapdata.shape[0] * 1.0 / mapdata.shape[1]
lon_lat_box = (-88, -87.5, 41.6, 42.1)

#sigthings = traps
#sigthings = sigthings.groupby([ 'Longitude', 'Latitude']).max()['Date'].reset_index()
#X = sigthings[['Longitude', 'Latitude']].values
#kd = KernelDensity(bandwidth=0.02)
#kd.fit(X)

xv,yv = np.meshgrid(np.linspace(-88, -87.5, 100), np.linspace(41.6, 42.1, 100))
gridpoints = np.array([xv.ravel(),yv.ravel()]).T
#zv = np.exp(kd.score_samples(gridpoints).reshape(100,100))
plt.figure(figsize=(10,14))
plt.imshow(mapdata, 
           cmap=plt.get_cmap('gray'), 
           extent=lon_lat_box, 
           aspect=aspect)
#plt.imshow(zv, 
 #          origin='lower', 
    #         cmap=alpha_cm, 
    #       extent=lon_lat_box, 
     #      aspect=aspect)

locations = traps[['Longitude', 'Latitude']].drop_duplicates().values
loc =traps1[['Longitude', 'Latitude']].drop_duplicates().values
plt.scatter(locations[:,0], locations[:,1], marker='x')
plt.scatter(loc[:,0], loc[:,1], marker='o', color = 'red')
plt.show()
#plt.savefig('heatmap.png')



In [71]:
trap = train[['Longitude', 'Latitude']].drop_duplicates()

In [72]:
workingdata= spray[['Longitude', 'Latitude']]

longi= np.round(workingdata.Longitude, 2)
lati =np.round(workingdata.Latitude, 2)
p=[]
g =[]

for i in range(len(longi)):

    p.append(float("{0:.2f}".format(longi[i])))# formatting to show only 4 decimals
    g.append(float("{0:.2f}".format(lati[i])))
longi =p
lati= g

In [73]:
trap = train[['Longitude', 'Latitude']]
trap =trap.drop_duplicates()
trap_longi= np.round(trap.Longitude, 2)
trap_lati =np.round(trap.Latitude, 2)
trap_longi =list(trap_longi)# from series to lists
trap_lati =list(trap_lati)
p=[]
g =[]
for i in range(len(trap_lati)):

    p.append(float("{0:.2f}".format(trap_longi[i])))
    g.append(float("{0:.2f}".format(trap_lati[i])))
trap_longi =p
trap_lati= g

In [74]:
# for each unique trap coordinate, count how many spray records share the same
# rounded (2-decimal) longitude and latitude
f=[]
for i in range(len(trap_longi)):
    addi = 0.0
    for j in range(len(longi)):
        if (trap_longi[i] == longi[j]) and (trap_lati[i] == lati[j]):
            addi = addi + 1
    f.append(addi)

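The double loop above compares every unique trap coordinate against all ~15,000 spray records. An equivalent vectorized version, sketched below as a supplementary check, merges the two tables on the rounded coordinates instead.

In [ ]:
# supplementary sketch: count spray records per rounded trap coordinate with a merge
spray_rounded = spray[['Longitude', 'Latitude']].round(2)
spray_counts = spray_rounded.groupby(['Longitude', 'Latitude']).size().reset_index(name='sprayed')
trap_rounded = train[['Longitude', 'Latitude']].drop_duplicates().round(2)
merged = trap_rounded.merge(spray_counts, on=['Longitude', 'Latitude'], how='left').fillna({'sprayed': 0})
print(merged['sprayed'].sum())  # should match sum(f) computed above
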
In [75]:
len(longi)


Out[75]:
14835

In [76]:
print len(f)
sum(f)


138
Out[76]:
3901.0

In [77]:
trap['sprayed'] = f

In [78]:
data =trap[trap['sprayed'] > 1.0]

In [79]:
#EDA
print(train.shape)
print(train.head())
print(train.describe(include ='all'))
print(train.isnull().sum())
print(train.dtypes)


(10506, 12)
         Date                                            Address  \
0  2007-05-29  4100 North Oak Park Avenue, Chicago, IL 60634,...   
1  2007-05-29  4100 North Oak Park Avenue, Chicago, IL 60634,...   
2  2007-05-29  6200 North Mandell Avenue, Chicago, IL 60646, USA   
3  2007-05-29    7900 West Foster Avenue, Chicago, IL 60656, USA   
4  2007-05-29    7900 West Foster Avenue, Chicago, IL 60656, USA   

                  Species  Block           Street  Trap  \
0  CULEX PIPIENS/RESTUANS     41   N OAK PARK AVE  T002   
1          CULEX RESTUANS     41   N OAK PARK AVE  T002   
2          CULEX RESTUANS     62    N MANDELL AVE  T007   
3  CULEX PIPIENS/RESTUANS     79     W FOSTER AVE  T015   
4          CULEX RESTUANS     79     W FOSTER AVE  T015   

              AddressNumberAndStreet   Latitude  Longitude  AddressAccuracy  \
0  4100  N OAK PARK AVE, Chicago, IL  41.954690 -87.800991                9   
1  4100  N OAK PARK AVE, Chicago, IL  41.954690 -87.800991                9   
2   6200  N MANDELL AVE, Chicago, IL  41.994991 -87.769279                9   
3    7900  W FOSTER AVE, Chicago, IL  41.974089 -87.824812                8   
4    7900  W FOSTER AVE, Chicago, IL  41.974089 -87.824812                8   

   NumMosquitos  WnvPresent  
0             1           0  
1             1           0  
2             1           0  
3             1           0  
4             4           0  
              Date                                            Address  \
count        10506                                              10506   
unique          95                                                138   
top     2007-08-01  ORD Terminal 5, O'Hare International Airport, ...   
freq           551                                                750   
mean           NaN                                                NaN   
std            NaN                                                NaN   
min            NaN                                                NaN   
25%            NaN                                                NaN   
50%            NaN                                                NaN   
75%            NaN                                                NaN   
max            NaN                                                NaN   

                       Species         Block            Street   Trap  \
count                    10506  10506.000000             10506  10506   
unique                       7           NaN               128    136   
top     CULEX PIPIENS/RESTUANS           NaN   W OHARE AIRPORT   T900   
freq                      4752           NaN               750    750   
mean                       NaN     35.687797               NaN    NaN   
std                        NaN     24.339468               NaN    NaN   
min                        NaN     10.000000               NaN    NaN   
25%                        NaN     12.000000               NaN    NaN   
50%                        NaN     33.000000               NaN    NaN   
75%                        NaN     52.000000               NaN    NaN   
max                        NaN     98.000000               NaN    NaN   

                    AddressNumberAndStreet      Latitude     Longitude  \
count                                10506  10506.000000  10506.000000   
unique                                 138           NaN           NaN   
top     1000  W OHARE AIRPORT, Chicago, IL           NaN           NaN   
freq                                   750           NaN           NaN   
mean                                   NaN     41.841139    -87.699908   
std                                    NaN      0.112742      0.096514   
min                                    NaN     41.644612    -87.930995   
25%                                    NaN     41.732984    -87.760070   
50%                                    NaN     41.846283    -87.694991   
75%                                    NaN     41.954690    -87.627796   
max                                    NaN     42.017430    -87.531635   

        AddressAccuracy  NumMosquitos    WnvPresent  
count      10506.000000  10506.000000  10506.000000  
unique              NaN           NaN           NaN  
top                 NaN           NaN           NaN  
freq                NaN           NaN           NaN  
mean           7.819532     12.853512      0.052446  
std            1.452921     16.133816      0.222936  
min            3.000000      1.000000      0.000000  
25%            8.000000      2.000000      0.000000  
50%            8.000000      5.000000      0.000000  
75%            9.000000     17.000000      0.000000  
max            9.000000     50.000000      1.000000  
Date                      0
Address                   0
Species                   0
Block                     0
Street                    0
Trap                      0
AddressNumberAndStreet    0
Latitude                  0
Longitude                 0
AddressAccuracy           0
NumMosquitos              0
WnvPresent                0
dtype: int64
Date                       object
Address                    object
Species                    object
Block                       int64
Street                     object
Trap                       object
AddressNumberAndStreet     object
Latitude                  float64
Longitude                 float64
AddressAccuracy             int64
NumMosquitos                int64
WnvPresent                  int64
dtype: object

In [80]:
#EDA
#M = missing data
#T = trace precipitation
print(weather.shape)
print(weather.head())
print(weather.describe(include ='all'))
print(weather.isnull().sum())
print(weather.dtypes)


(2944, 22)
   Station        Date  Tmax  Tmin Tavg Depart  DewPoint WetBulb Heat Cool  \
0        1  2007-05-01    83    50   67     14        51      56    0    2   
1        2  2007-05-01    84    52   68      M        51      57    0    3   
2        1  2007-05-02    59    42   51     -3        42      47   14    0   
3        2  2007-05-02    60    43   52      M        42      47   13    0   
4        1  2007-05-03    66    46   56      2        40      48    9    0   

     ...    CodeSum Depth Water1 SnowFall PrecipTotal StnPressure SeaLevel  \
0    ...                0      M      0.0        0.00       29.10    29.82   
1    ...                M      M        M        0.00       29.18    29.82   
2    ...         BR     0      M      0.0        0.00       29.38    30.09   
3    ...      BR HZ     M      M        M        0.00       29.44    30.08   
4    ...                0      M      0.0        0.00       29.39    30.12   

  ResultSpeed ResultDir  AvgSpeed  
0         1.7        27       9.2  
1         2.7        25       9.6  
2        13.0         4      13.4  
3        13.3         2      13.4  
4        11.7         7      11.9  

[5 rows x 22 columns]
            Station        Date         Tmax         Tmin  Tavg Depart  \
count   2944.000000        2944  2944.000000  2944.000000  2944   2944   
unique          NaN        1472          NaN          NaN    60     42   
top             NaN  2013-06-02          NaN          NaN    73      M   
freq            NaN           2          NaN          NaN   138   1472   
mean       1.500000         NaN    76.166101    57.810462   NaN    NaN   
std        0.500085         NaN    11.461970    10.381939   NaN    NaN   
min        1.000000         NaN    41.000000    29.000000   NaN    NaN   
25%        1.000000         NaN    69.000000    50.000000   NaN    NaN   
50%        1.500000         NaN    78.000000    59.000000   NaN    NaN   
75%        2.000000         NaN    85.000000    66.000000   NaN    NaN   
max        2.000000         NaN   104.000000    83.000000   NaN    NaN   

           DewPoint WetBulb  Heat  Cool    ...    CodeSum Depth Water1  \
count   2944.000000    2944  2944  2944    ...       2944  2944   2944   
unique          NaN      48    31    31    ...         98     2      1   
top             NaN      63     0     0    ...                M      M   
freq            NaN     135  1870  1147    ...       1609  1472   2944   
mean      53.457880     NaN   NaN   NaN    ...        NaN   NaN    NaN   
std       10.675181     NaN   NaN   NaN    ...        NaN   NaN    NaN   
min       22.000000     NaN   NaN   NaN    ...        NaN   NaN    NaN   
25%       46.000000     NaN   NaN   NaN    ...        NaN   NaN    NaN   
50%       54.000000     NaN   NaN   NaN    ...        NaN   NaN    NaN   
75%       62.000000     NaN   NaN   NaN    ...        NaN   NaN    NaN   
max       75.000000     NaN   NaN   NaN    ...        NaN   NaN    NaN   

       SnowFall PrecipTotal StnPressure SeaLevel  ResultSpeed    ResultDir  \
count      2944        2944        2944     2944  2944.000000  2944.000000   
unique        4         168         104      102          NaN          NaN   
top           M        0.00       29.34    30.00          NaN          NaN   
freq       1472        1577         128       96          NaN          NaN   
mean        NaN         NaN         NaN      NaN     6.960666    17.494905   
std         NaN         NaN         NaN      NaN     3.587527    10.063609   
min         NaN         NaN         NaN      NaN     0.100000     1.000000   
25%         NaN         NaN         NaN      NaN     4.300000     7.000000   
50%         NaN         NaN         NaN      NaN     6.400000    19.000000   
75%         NaN         NaN         NaN      NaN     9.200000    25.000000   
max         NaN         NaN         NaN      NaN    24.100000    36.000000   

        AvgSpeed  
count       2944  
unique       178  
top          6.9  
freq          63  
mean         NaN  
std          NaN  
min          NaN  
25%          NaN  
50%          NaN  
75%          NaN  
max          NaN  

[11 rows x 22 columns]
Station        0
Date           0
Tmax           0
Tmin           0
Tavg           0
Depart         0
DewPoint       0
WetBulb        0
Heat           0
Cool           0
Sunrise        0
Sunset         0
CodeSum        0
Depth          0
Water1         0
SnowFall       0
PrecipTotal    0
StnPressure    0
SeaLevel       0
ResultSpeed    0
ResultDir      0
AvgSpeed       0
dtype: int64
Station          int64
Date            object
Tmax             int64
Tmin             int64
Tavg            object
Depart          object
DewPoint         int64
WetBulb         object
Heat            object
Cool            object
Sunrise         object
Sunset          object
CodeSum         object
Depth           object
Water1          object
SnowFall        object
PrecipTotal     object
StnPressure     object
SeaLevel        object
ResultSpeed    float64
ResultDir        int64
AvgSpeed        object
dtype: object
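
Note that the weather file codes missing values as the string 'M' and trace precipitation as 'T', so isnull() above reports zero missing values even though many fields are unusable. A supplementary check of how often those codes actually appear:

In [ ]:
# supplementary check: count 'M' (missing) and 'T' (trace) codes per column;
# strip whitespace first because some fields (e.g. PrecipTotal) are space-padded
stripped = weather.astype(str).apply(lambda col: col.str.strip())
print((stripped == 'M').sum())
print((stripped == 'T').sum())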

In [81]:
weather.columns


Out[81]:
Index([u'Station', u'Date', u'Tmax', u'Tmin', u'Tavg', u'Depart', u'DewPoint',
       u'WetBulb', u'Heat', u'Cool', u'Sunrise', u'Sunset', u'CodeSum',
       u'Depth', u'Water1', u'SnowFall', u'PrecipTotal', u'StnPressure',
       u'SeaLevel', u'ResultSpeed', u'ResultDir', u'AvgSpeed'],
      dtype='object')

In [82]:
#only working with the data from Station No. 1 and using Tavg, total precipitation, and wind speed as variables
weather = weather[weather['Station']== 1]
weather = weather[['Date','Tavg','PrecipTotal','ResultSpeed']]

In [83]:
# Adding the weather data to the train and test data
train_add =train.join(weather.set_index('Date'), on='Date')
test_add = test.join(weather.set_index('Date'), on='Date')

In [84]:
# reformatting  Tavg to float
train_add['Tavg']=train_add.Tavg.astype(float)
# assigning 0 instead of T and reformatting PrecipTotal to float
train_add['PrecipTotal']= np.where(train_add['PrecipTotal']=='  T', '0.0', train_add['PrecipTotal'])
train_add['PrecipTotal']= train_add.PrecipTotal.astype(float)

In [100]:
# Average temperature on days with or without WNV detected in a trap
ax = sns.boxplot( y="Tavg",x="WnvPresent", data=train_add,showfliers=False)
ax.set(xlabel="West Nile Virus Present", ylabel="Average Temperature (F)")
plt.show()



In [98]:
ax = sns.boxplot( y="PrecipTotal",x="WnvPresent", data=train_add, showfliers=False)
ax.set(xlabel="Precipitation Total", ylabel="West Nile Virus Present")
plt.show()



In [150]:
ax = sns.boxplot( y="ResultSpeed",x="WnvPresent", data=train_add)
plt.show()


Wind speed, precipitation, and average temperature do not appear to have a strong effect on WNV presence. Location, the number of mosquitoes in the trap, and the mosquito species appear to matter more.

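To put a number on this claim, a supplementary check (not in the original notebook) can compare the average weather conditions between WNV-positive and WNV-negative records:

In [ ]:
# supplementary check: mean weather values for WNV-negative (0) vs. WNV-positive (1) records
print(train_add.groupby('WnvPresent')[['Tavg', 'PrecipTotal', 'ResultSpeed']].mean())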

In [248]:
from datetime import datetime
train['Date'] = pd.to_datetime(train['Date'], format='%Y-%m-%d')
test['Date'] = pd.to_datetime(test['Date'], format='%Y-%m-%d')
weather['Date'] = pd.to_datetime(weather['Date'], format='%Y-%m-%d')
spray['Date'] = pd.to_datetime(spray['Date'], format='%Y-%m-%d')

In [249]:
train['week'] = train['Date'].dt.weekofyear
train['year'] = train['Date'].dt.year
test['week'] = test['Date'].dt.weekofyear
test['year'] = test['Date'].dt.year
spray['year']=spray['Date'].dt.year
weather['year']=weather['Date'].dt.year

In [78]:
print test.columns
train.columns


Index([u'Id', u'Date', u'Address', u'Species', u'Block', u'Street', u'Trap',
       u'AddressNumberAndStreet', u'Latitude', u'Longitude',
       u'AddressAccuracy', u'week', u'year'],
      dtype='object')
Out[78]:
Index([u'Date', u'Address', u'Species', u'Block', u'Street', u'Trap',
       u'AddressNumberAndStreet', u'Latitude', u'Longitude',
       u'AddressAccuracy', u'NumMosquitos', u'WnvPresent', u'week', u'year'],
      dtype='object')

In [6]:
weeks = train.groupby(['year','week'], as_index=False).WnvPresent.sum()

In [80]:
test.head()


Out[80]:
Id Date Address Species Block Street Trap AddressNumberAndStreet Latitude Longitude AddressAccuracy week year
0 1 2008-06-11 4100 North Oak Park Avenue, Chicago, IL 60634,... CULEX PIPIENS/RESTUANS 41 N OAK PARK AVE T002 4100 N OAK PARK AVE, Chicago, IL 41.95469 -87.800991 9 24 2008
1 2 2008-06-11 4100 North Oak Park Avenue, Chicago, IL 60634,... CULEX RESTUANS 41 N OAK PARK AVE T002 4100 N OAK PARK AVE, Chicago, IL 41.95469 -87.800991 9 24 2008
2 3 2008-06-11 4100 North Oak Park Avenue, Chicago, IL 60634,... CULEX PIPIENS 41 N OAK PARK AVE T002 4100 N OAK PARK AVE, Chicago, IL 41.95469 -87.800991 9 24 2008
3 4 2008-06-11 4100 North Oak Park Avenue, Chicago, IL 60634,... CULEX SALINARIUS 41 N OAK PARK AVE T002 4100 N OAK PARK AVE, Chicago, IL 41.95469 -87.800991 9 24 2008
4 5 2008-06-11 4100 North Oak Park Avenue, Chicago, IL 60634,... CULEX TERRITANS 41 N OAK PARK AVE T002 4100 N OAK PARK AVE, Chicago, IL 41.95469 -87.800991 9 24 2008

In [71]:
weeks.dtypes


Out[71]:
year          int64
week          int64
WnvPresent    int64
index         int64
dtype: object

In [79]:
import seaborn as sns
#weeks['Datetime'] = df.index
ax=sns.lmplot(data = weeks, x='week',  y='WnvPresent',hue='year', fit_reg=False, palette="Set1",markers= '_' )
#plt.plot( weeks['week'],  weeks['WnvPresent'] , color = weeks['year'])
plt.show()



In [45]:
weeks['index'] =weeks.index
weeks.head()


Out[45]:
year week WnvPresent index
0 2007 22 0 0
1 2007 23 0 1
2 2007 26 0 2
3 2007 27 0 3
4 2007 28 0 4

In [307]:
weeks.year.unique()


Out[307]:
array([2007, 2009, 2011, 2013])

In [309]:
fig = plt.figure(figsize=(7,4))
ax = fig.gca()
    
#ax = sns.distplot(ax=ax)
#proposal_xrange = np.linspace(np.min(data)-2, np.max(data)+2, 200)
ax.plot(weeks[weeks['year']== 2007].week, weeks[weeks['year']== 2007].WnvPresent , lw=2, color='darkred', alpha=0.1,
            label='2007')
    
ax.plot(weeks[weeks['year']== 2009].week, weeks[weeks['year']== 2009].WnvPresent , lw=2, color='darkred', alpha=0.3,
            label='2009')
ax.plot(weeks[weeks['year']== 2011].week, weeks[weeks['year']== 2011].WnvPresent , lw=2, color='darkred', alpha=0.5,
            label='2011 ')
ax.plot(weeks[weeks['year']== 2013].week, weeks[weeks['year']== 2013].WnvPresent , lw=2, color='darkred', alpha=1.0,
            label='2013 ')
    
ax.legend(loc='upper left')
plt.show()



In [312]:
w30 =pd.read_csv('weather_ave_30.csv')

In [313]:
w30.head()


Out[313]:
Date Tmax Tmin Tavg Depart DewPoint WetBulb Heat Cool PrecipTotal ... SQ FG+ MI TS DZ RA BR FG SN Datetime_Date
0 2007-05-01 83.0 50.0 67.0 14.0 51.000000 56.000000 0.0 2.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.00 0.000000 0.0 0.0 2007-05-01
2 2007-05-02 83.0 42.0 59.0 -3.0 46.500000 51.500000 14.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.00 0.500000 0.0 0.0 2007-05-02
4 2007-05-03 83.0 42.0 58.0 2.0 44.333333 50.333333 14.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.00 0.333333 0.0 0.0 2007-05-03
6 2007-05-04 83.0 42.0 58.0 4.0 43.500000 50.250000 14.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.25 0.250000 0.0 0.0 2007-05-04
8 2007-05-05 83.0 42.0 58.4 5.0 42.400000 50.000000 14.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.20 0.200000 0.0 0.0 2007-05-05

5 rows × 29 columns


In [ ]:

Models


In [ ]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.cross_validation import cross_val_score, StratifiedKFold, train_test_split, KFold
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

In [ ]:
# model evaluation function; relies on the globals X, y, X_test, and y_test defined in the cells below
def evaluate_model(model):
    model.fit(X, y)
    y_pred = model.predict(X_test)
    
    a = accuracy_score(y_test, y_pred)
    probabilities = model.predict_proba(X_test)
    #cm = confusion_matrix(y_test, y_pred)
    conmat = np.array(confusion_matrix(y_test, y_pred, labels=[1,0]))

    confusion = pd.DataFrame(conmat, index=['1', '0'],
                         columns=['predicted_1','predicted_0'])

    cr = classification_report(y_test, y_pred)
    
    print 'confusion matrix:',confusion
    print 'classification_report:',cr
    print 'Accuracy of the model on test:',a
    return probabilities

Random Forest


In [ ]:
X = train[['week', 'year']]
y = train['WnvPresent']

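evaluate_model scores against X_test and y_test, which are never constructed in this notebook. A minimal holdout split consistent with that function is sketched below; the 80/20 ratio and random_state are assumptions, not from the original work.

In [ ]:
# hypothetical holdout split so evaluate_model has X_test / y_test to score against
# (the 80/20 ratio and random_state are assumptions)
X, X_test, y, y_test = train_test_split(X, y, test_size=0.2, random_state=33)
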
In [ ]:
## Grid search over Random Forest parameters
params = {'max_features': [0.1, 0.5, 1.0], 'max_depth': [3, 5, 10, None], 'n_estimators': [3, 5, 10, 50]}

gsrf = GridSearchCV(RandomForestClassifier(random_state = 33),
                     params, n_jobs=-1,
                     cv=StratifiedKFold(y, n_folds=3, shuffle=True))
gsrf.fit(X, y)
print 'best parameters for the model:',gsrf.best_params_
print 'best score on train:',gsrf.best_score_
probability = evaluate_model(gsrf.best_estimator_)
#print 'model score on test:',score

KNN


In [ ]:
# Grid search over KNN parameters
params = {'n_neighbors': range(2,60)}

gsknn = GridSearchCV(KNeighborsClassifier(),
                     params, n_jobs=-1,
                     cv=StratifiedKFold(y, n_folds=3, shuffle=True))
gsknn.fit(X, y)
print 'best parameters for the model:',gsknn.best_params_
print 'best score on train:',gsknn.best_score_
probability = evaluate_model(gsknn.best_estimator_)
#print 'model score on test:',score

Logistic Regression


In [ ]:
# Grid search over logistic regression parameters
params = {'penalty': ['l2','l1'], 'C':[0.1,1.0,10]}

# 'sag' supports only the l2 penalty, so 'liblinear' is used to search over both l1 and l2
gslr = GridSearchCV(LogisticRegression(solver = 'liblinear', random_state = 33),
                     params, n_jobs=-1,
                     cv=StratifiedKFold(y, n_folds=3, shuffle=True))
gslr.fit(X, y)
print 'best parameters for the model:',gslr.best_params_
print 'best score on train:',gslr.best_score_
probability = evaluate_model(gslr.best_estimator_)
#print 'model score on test:',score

SVM


In [ ]:
# fit the scaler on the training features and apply the same transform to the test features
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)
scaled_X_test = scaler.transform(X_test)
# Grid search over support vector machine parameters
params = { 'C': [0.01, 0.1, 1.0, 10.0, 30.0, 100.0],
          'gamma': ['auto', 0.1, 1.0, 10.0],
          'kernel': ['linear', 'rbf']}

gssvc = GridSearchCV(SVC(probability=True, random_state = 33),  # probability=True lets evaluate_model call predict_proba
                     params, n_jobs=-1,
                     cv=StratifiedKFold(y, n_folds=3, shuffle=True))
gssvc.fit(scaled_X, y)
print 'best parameters for the model:',gssvc.best_params_
print 'best score on train:',gssvc.best_score_
probability = evaluate_model(gssvc.best_estimator_)  # note: evaluate_model refits on the unscaled X and X_test, so this score does not use the scaled features
#print 'model score on test:',score

Gradient Boosting


In [ ]:
## Grid search over Gradient Boosting parameters
params = {'max_features': [0.1, 0.5, 1.0], 'max_depth': [1, 3, 5], 'n_estimators': [3, 5, 10, 50]}

gsgb = GridSearchCV(GradientBoostingClassifier(random_state = 33),
                     params, n_jobs=-1,
                     cv=StratifiedKFold(y, n_folds=3, shuffle=True))
gsgb.fit(X, y)

print 'best parameters for the model:',gsgb.best_params_
print 'best score on train:',gsgb.best_score_
probability = evaluate_model(gsgb.best_estimator_)
#print 'model score on test:',score

In [ ]:
### TODO: compare the tuned models with cross_val_score (see the sketch below)

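One way to fill in this step, consistent with the ROC analysis below, is to compare the tuned models with cross-validated ROC AUC; the following is a supplementary sketch, not part of the original notebook.

In [ ]:
# supplementary sketch: cross-validated ROC AUC for each tuned model
# (the SVC is omitted because it was fit on scaled features)
for name, model in [('random forest', gsrf.best_estimator_),
                    ('knn', gsknn.best_estimator_),
                    ('logistic regression', gslr.best_estimator_),
                    ('gradient boosting', gsgb.best_estimator_)]:
    scores = cross_val_score(model, X, y, cv=3, scoring='roc_auc')
    print(name + ': ' + str(scores.mean()))
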
In [ ]:
#AUC 
y_score=[]
for i in range(0,len(probability)):
    y_score.append(probability[i][1])


FPR = dict()
TPR = dict()
ROC_AUC = dict()

# For class 1, find the area under the curve
FPR[1], TPR[1], _ = roc_curve(y_test, y_score)
ROC_AUC[1] = auc(FPR[1], TPR[1])

# Plot of a ROC curve for class 1 
plt.figure(figsize=[5,5])
plt.plot(FPR[1], TPR[1], label='ROC curve (area = %0.2f)' % ROC_AUC[1], linewidth=4)
plt.plot([0, 1], [0, 1], 'k--', linewidth=4)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('Receiver operating characteristic for West Nile virus presence', fontsize=12)
plt.legend(loc="lower right")
plt.savefig('ROC_AUC.png')
plt.show()